#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# QQ: 34538980@qq.com
# Jekkay Hu, 2013.5.5
# ///////////////////////////////////////////////////////////////////
#                            _ooOoo_                               //
#                           o8888888o                              //
#                           88" . "88                              //
#                           (| ^_^ |)                              //
#                           O\  =  /O                              //
#                        ____/`---'\____                           //
#                      .'  \\|     |//  `.                         //
#                     /  \\|||  :  |||//  \                        //
#                    /  _||||| -:- |||||-  \                       //
#                    |   | \\\  -  /// |   |                       //
#                    | \_|  ''\---/''  |   |                       //
#                    \  .-\__  `-`  ___/-. /                       //
#                  ___`. .'  /--.--\  `. . ___                     //
#                ."" '<  `.___\_<|>_/___.'  >'"".                  //
#              | | :  `- \`.;`\ _ /`;.`/ - ` : | |                 //
#              \  \ `-.   \_ __\ /__ _/   .-` /  /                 //
#        ========`-.____`-.___\_____/___.-`____.-'========         //
#                             `=---='                              //
#        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^        //
#             佛祖保佑       永无BUG        运行正常                   //
#     -------------------------------------------------------      //
#               QQ: 34538980@qq.com                                //
#               博客: http://www.easysb.cn                          //
#               Jekkay Hu, 2013.5.5                                //
# ///////////////////////////////////////////////////////////////////
#
# 超链接的处理器
#
import re
import time
import logging
from utility.function import get_url, is_legal_domain
from utility.domain import Domain

BAIDU_FOLLOW_LINK = 'http://www.baidu.com/link?url='
MAX_FOLLOW_COUNT = 20


class LinksProcessor(object):
    """Hyperlink processing helpers.

    Extracts, deduplicates and normalizes URLs found in raw page content,
    optionally following Baidu redirect links to discover their targets.
    All methods are static; the class is a pure namespace.
    """

    # Extract all hyperlinks and deduplicate them.
    @staticmethod
    def extract_distinct_links(content):
        """Return the set of distinct URLs found in *content*.

        Always returns a set (empty when *content* is falsy or contains no
        links), so callers can iterate without a None check.
        """
        if not content:
            return set()
        links = LinksProcessor.extract_all_links(content)
        # Each findall() match is a tuple (full_url, scheme); keep the URL.
        return set(match[0] for match in links)

    @staticmethod
    def extract_all_links(content):
        """Return raw regex matches [(url, scheme), ...] for http(s)/ftp(s) URLs.

        Bug fix: the original character class used [a-zA-z]; by ASCII range
        that also matched '[', '\\', ']', '^' and '`', absorbing garbage
        characters into extracted URLs.  Corrected to [a-zA-Z].
        """
        return re.findall(r'((http|ftp)s?://[a-zA-Z0-9_.\-/?=]+)', str(content))

    @staticmethod
    def extract_follow_link_list(content, follow=None):
        """Extract distinct site roots (scheme://host) from *content*.

        When *follow* is truthy, Baidu redirect links are additionally
        fetched (at most MAX_FOLLOW_COUNT of them, throttled one second
        apart) and the site roots of their targets are included.

        Always returns a set (possibly empty) — the original could return
        None, a list, or a set depending on the code path taken.
        """
        all_links = set()
        try:
            # Deduplicate first so each URL is processed once.
            links = LinksProcessor.extract_distinct_links(content)
            if not links:
                return all_links
            # Strip paths so only scheme://host remains.
            all_links.update(LinksProcessor.strip_link_path(link) for link in links)
            # strip_link_path yields None for malformed links; drop it.
            all_links.discard(None)
            if not follow:
                return all_links
            follow_count = 0
            for link in links:
                if not link:
                    continue
                # Is this a Baidu redirect link?
                if BAIDU_FOLLOW_LINK in link:
                    follow_count += 1
                    # Throttle requests so Baidu does not ban us.
                    time.sleep(1)
                    logging.info("[百度] >>> %s", link)
                    # Follow the redirect once; never recurse further.
                    sub_links = LinksProcessor.extract_follow_link_list(get_url(link), False)
                    if sub_links:
                        all_links.update(sub_links)
                    if follow_count >= MAX_FOLLOW_COUNT:
                        break
        except Exception as e:
            # Best-effort: network/parse failures must not kill the crawl.
            logging.warning("extract_follow_link_list exception %s", str(e))
        return all_links

    @staticmethod
    def extract_domain_list(content, follow=None):
        """Convert every site link found in *content* into a Domain object.

        Links whose domain is illegal or unparseable are skipped.
        Bug fix: the original crashed with TypeError when the link
        extraction produced no result; the `or []` guard keeps this robust
        even against a None return.
        """
        domain_list = []
        for link in LinksProcessor.extract_follow_link_list(content, follow) or []:
            d = LinksProcessor.convert_link_to_domain(link)
            if d:
                domain_list.append(d)
        return domain_list

    @staticmethod
    def convert_link_to_domain(link):
        """Return a Domain built from *link*, or None when conversion fails
        or the resulting domain is not legal."""
        d = Domain.convert(link)
        return d if d and is_legal_domain(d.domain) else None

    @staticmethod
    def wrap_domain(dic, title):
        """Build a Domain whose attributes come wholesale from *dic*, then
        override its title.

        NOTE(review): replacing __dict__ bypasses whatever Domain.__init__
        sets up, so *dic* must supply every attribute Domain expects —
        confirm against Domain's definition.
        """
        d = Domain()
        d.__dict__ = dic
        d.title = title
        return d

    @staticmethod
    def strip_link_path(link):
        """Reduce *link* to 'scheme://host', dropping any path component.

        Returns None when *link* is empty or lacks a '://' separator.
        """
        if not link:
            return None
        scheme_end = link.find('://')
        if scheme_end <= 0:
            return None
        path_start = link.find('/', scheme_end + len('://'))
        if path_start > 0:
            link = link[:path_start]
        # Drop anything after a stray backslash (malformed-URL residue).
        backslash = link.find('\\')
        return link[:backslash] if backslash > 0 else link

